# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
# NOTE(review): the project id and access token are credentials (shown here as '...' placeholders);
# avoid sharing or committing real values.
from project_lib import Project
# Handle through which get_file_handle() fetches the data files used in this notebook.
project = Project(project_id='...', project_access_token='...')


# Import required libraries
import itertools
import os
import requests
import tarfile
from collections import defaultdict

import string
import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score

import matplotlib.pyplot as plt
import plotly.graph_objs as go


def get_file_handle(fname):
    """
    Fetch a data file from the project and rewind it.

    :param fname: name of the file to fetch via ``project.get_file``
    :return: the object returned by ``project.get_file``, positioned at offset 0
    """
    handle = project.get_file(fname)
    # Rewind so that whoever reads the handle next starts at the beginning.
    handle.seek(0)
    return handle

# Fetch every data file used in this notebook. Despite the name, each entry is the
# rewound object returned by get_file_handle(), not a filesystem path.
# Order matters, because the list is indexed positionally further down:
#   0: unigrams.csv, 1: bigrams.csv, 2: ADJECTIVES.xlsx, 3: SEMANTIC_CLASSES.xlsx
data_paths = [get_file_handle(file) for file in ['unigrams.csv', 'bigrams.csv', 'ADJECTIVES.xlsx', 'SEMANTIC_CLASSES.xlsx']]


# Five short sample comments used to try out the sentiment functions below.
# They are reused later as the first part of sentiment_sentences.
comments_5 = [
    'Customer service was polite.',
    'The socks are a pretty color but expensive.',
    'The shirt I bought was green and service was great.',
    'I think the sweater and socks were perfect.',
    'I do not like the shoes, so ugly and expensive.',
]


# Read unigram data (first entry of data_paths, i.e. unigrams.csv) and preview the first rows.
# Only the UNIGRAM and SENTIMENT_SCORE columns are used below.
unigram_df = pd.read_csv(data_paths[0])
print(unigram_df.head())

# Create dict with unigram and sentiment: {unigram: sentiment score}.
# This is the lookup table used by calculate_unigram_sentiment() and by the
# composition rules (via is_given_sentiment()).
unigram_sentiment_dict = pd.Series(unigram_df.SENTIMENT_SCORE.values,
                                   index=unigram_df.UNIGRAM.values).to_dict()

   Unnamed: 0 UNIGRAM  SENTIMENT_SCORE  sentiment  uni_len first_letter
0           0      aa         0.019674          1        2            a
1           1     aaa         0.032775          1        3            a
2           2    aaas         0.074593          1        4            a
3           3  aachen         0.011926          1        6            a
4           4     aah         0.118070          1        3            a


# Simple implementation: every token found in the sentiment map contributes its score.
# The scores are summed (not averaged), so longer inputs can reach larger magnitudes.
def calculate_unigram_sentiment(tokenized_sentence, sentiment_map=unigram_sentiment_dict):
    """
    Add up the lexicon scores of the tokens that appear in the sentiment map.

    :param tokenized_sentence: iterable of tokens (e.g. the output of word_tokenize,
                               or a bigram tuple)
    :param sentiment_map: {token: score}; defaults to unigram_sentiment_dict as it was
                          bound when this function was defined
    :return: sum of the scores of all known tokens; 0 when no token is known
    """
    total = 0
    for word in tokenized_sentence:
        word_score = sentiment_map.get(word)
        # Tokens missing from the map (or mapped to None) do not contribute.
        if word_score is not None:
            total += word_score

    return total


# Print out example sentences and their sentiment.
# The more negative the score, the more negative the sentiment.
# The more positive the score, the more positive the sentiment.
# Each sentence is lower-cased before tokenizing
# (presumably because the lexicon keys are lower-case -- confirm against unigrams.csv).
for sentence in comments_5:
    score = calculate_unigram_sentiment(word_tokenize(sentence.lower()))
    print('Score: {}, Sentence: {}'.format(score, sentence))

Score: 0.7981163, Sentence: Customer service was polite.
Score: -0.26573172, Sentence: The socks are a pretty color but expensive.
Score: 0.18411419999999995, Sentence: The shirt I bought was green and service was great.
Score: 0.16159839999999998, Sentence: I think the sweater and socks were perfect.
Score: -1.1203735, Sentence: I do not like the shoes, so ugly and expensive.


# Read bigram data (second entry of data_paths, i.e. bigrams.csv) and preview the first rows.
# Only the BIGRAM and SENTIMENT_SCORE columns are used below.
bigrams_df = pd.read_csv(data_paths[1])
bigrams_df.head()


# Lookup table {(word1, word2): sentiment score}. BIGRAM values are 'word1-word2' strings;
# they are split on '-' and turned into tuples so they can be looked up with the
# tuples produced by nltk.bigrams().
# NOTE(review): an entry whose words themselves contain '-' would split into more than
# two parts and could never match a 2-tuple -- confirm the lexicon has no such entries.
bigram_sentiment_dict = pd.Series(bigrams_df.SENTIMENT_SCORE.values,
                                   index=bigrams_df.BIGRAM.str.split('-').apply(lambda l: tuple(l))).to_dict()


def calculate_bigram_sentiment_sentence(sentence_bigrams):
    """
    Score a sentence using only the bigram lexicon.

    :param sentence_bigrams: iterable of bigram tuples, e.g. list(nltk.bigrams(tokens))
    :return: (score, matched_bigrams) where score is the sum of the per-bigram values
             returned by calculate_bigram_sentiment and matched_bigrams lists, in order,
             the bigrams for which that function returned a value
    """
    matched = []
    total = 0
    for pair in sentence_bigrams:
        polarity = calculate_bigram_sentiment(pair)
        # None means the bigram is unknown: it neither scores nor gets listed.
        if polarity is None:
            continue
        total += polarity
        matched.append(pair)

    return total, matched


def calculate_bigram_sentiment(bigram, sentiment_map=None, neutral_margin=0.02):
    """
    Map the lexicon score of a bigram onto a coarse polarity.

    :param bigram: tuple of two tokens, e.g. ('accept', 'payments')
    :param sentiment_map: optional {bigram tuple: score} mapping; when omitted the
                          module-level bigram_sentiment_dict is used
    :param neutral_margin: scores within [-neutral_margin, +neutral_margin] count as
                           neutral, which allows a margin of error around zero
    :return: -1 (negative), 1 (positive), 0 (neutral),
             or None when the bigram is not in the mapping
    """
    if sentiment_map is None:
        # Resolved at call time so the default always reflects the current lexicon.
        sentiment_map = bigram_sentiment_dict

    bigram_sentiment = sentiment_map.get(bigram)
    # Compare against None explicitly: a score of exactly 0.0 is falsy, but it is a
    # legitimate neutral entry and must not be reported as "unknown bigram".
    if bigram_sentiment is None:
        return None

    if bigram_sentiment < -neutral_margin:
        return -1
    if bigram_sentiment > neutral_margin:
        return 1
    return 0


# Try the bigram-only scorer on two sentences: lower-case, tokenize, build bigrams, score.
# The return values are not assigned; the output captured in this notebook is the
# result of the second call only.
sentence = 'The high prices are ridiculous'
sentence2 = 'They do not accept payments from my credit card'
calculate_bigram_sentiment_sentence(list(nltk.bigrams(word_tokenize(sentence.lower()))))
calculate_bigram_sentiment_sentence(list(nltk.bigrams(word_tokenize(sentence2.lower()))))

(1, [('accept', 'payments')])


# Open ADJECTIVES.xlsx (third entry of data_paths) once and read its sheets from it.
xls_file = pd.ExcelFile(data_paths[2])
# Adjective classes; fully empty rows are dropped and the index renumbered so the sheet
# can be consumed in fixed-size row blocks further down.
adjective_expansion = pd.read_excel(xls_file, 'ADJECTIVE_EXPANSION').dropna(how='all').reset_index(drop=True)
# Word lists taken from the first column of header-less sheets. As used in
# adjective_conditions: a word in a *_PN list scores positive after a HIGH/FAST adjective
# and negative after a LOW/SLOW one; a word in a *_NP list scores the other way round.
high_low_PN = pd.read_excel(xls_file, '(HIGH,LOW)_POS_NEG', header=None)[0].values.tolist()
high_low_NP = pd.read_excel(xls_file, '(HIGH,LOW)_NEG_POS', header=None)[0].values.tolist()
fast_slow_PN = pd.read_excel(xls_file, '(FAST,SLOW)_POS_NEG', header=None)[0].values.tolist()
fast_slow_NP = pd.read_excel(xls_file, '(FAST,SLOW)_NEG_POS', header=None)[0].values.tolist()


def clean_adjective_df(df):
    """
    Flatten one block of the adjective sheet into (adjective, category) pairs.

    :param df: DataFrame whose cell [0, 0] holds the category label and whose
               columns 1 to 4 hold the adjectives (empty cells are skipped)
    :return: [(adjective, category), ...] ordered column by column, top to bottom
    """
    # Columns 1..4 hold the words; column 0 is reserved for the category label.
    words = [
        word
        for column in range(1, 5)
        for word in df.iloc[:, column].dropna().tolist()
    ]
    category = df.iloc[0, 0]

    return [(word, category) for word in words]


# Split the adjective sheet into consecutive blocks of token_rows + 1 rows and flatten
# each block into (adjective, category) pairs. .loc slicing includes both end labels,
# so i:i+token_rows selects token_rows + 1 rows.
tokens = []
token_rows = 5
for i in range(0, len(adjective_expansion), token_rows+1):
    tokens.append(clean_adjective_df(adjective_expansion.loc[i:i+token_rows]))
# Unpacking requires the sheet to yield exactly four blocks. The variable names assume
# the order HIGH, LOW, FAST, SLOW; the actual category label comes from the sheet itself.
high_tokens, low_tokens, fast_tokens, slow_tokens = tokens

# {adjective: category label}; if an adjective appears in several blocks, the last one wins.
adjective_class_map = dict(high_tokens + low_tokens + fast_tokens + slow_tokens)


# Sanity check: the distinct category labels found in the sheet.
set(adjective_class_map.values())

{'FAST_TOKENS', 'HIGH_TOKENS', 'LOW_TOKENS', 'SLOW_TOKENS'}


# Open SEMANTIC_CLASSES.xlsx (fourth entry of data_paths) once and read its sheets from it.
semantic_classes_file = pd.ExcelFile(data_paths[3])


# One word list per composition class, taken from the first column of each header-less
# sheet. They are matched against the first word of a bigram in
# calculate_composition_or_adj_sentiment().
dominator_neg = pd.read_excel(semantic_classes_file, 'DOMINATOR_NEG', header=None)[0].values.tolist()
dominator_pos = pd.read_excel(semantic_classes_file, 'DOMINATOR_POS', header=None)[0].values.tolist()
propagator_pos = pd.read_excel(semantic_classes_file, 'PROPAGATOR_POS', header=None)[0].values.tolist()
propagator_neg = pd.read_excel(semantic_classes_file, 'PROPAGATOR_NEG', header=None)[0].values.tolist()
reverser_pos = pd.read_excel(semantic_classes_file, 'REVERSER_POS', header=None)[0].values.tolist()
reverser_neg = pd.read_excel(semantic_classes_file, 'REVERSER_NEG', header=None)[0].values.tolist()


class Adjective:
    """Category labels of the adjective classes, used as keys of adjective_conditions.

    The values are the labels stored in adjective_class_map.
    """
    FAST = 'FAST_TOKENS'
    SLOW = 'SLOW_TOKENS'
    HIGH = 'HIGH_TOKENS'
    LOW = 'LOW_TOKENS'
    
class Sign:
    """Sentiment labels used for composition rules and for grouping results."""
    POSITIVE = 'Positive'
    NEGATIVE = 'Negative'
    NEUTRAL = 'Neutral'

# For each adjective class: the (word list, resulting sentiment) pairs to test the second
# word of a bigram against. FAST/HIGH and SLOW/LOW use the same word lists with the
# sentiments swapped.
adjective_conditions = {
    Adjective.FAST: [(fast_slow_PN, Sign.POSITIVE), (fast_slow_NP, Sign.NEGATIVE)],
    Adjective.SLOW: [(fast_slow_PN, Sign.NEGATIVE), (fast_slow_NP, Sign.POSITIVE)],
    Adjective.HIGH: [(high_low_PN, Sign.POSITIVE), (high_low_NP, Sign.NEGATIVE)],
    Adjective.LOW: [(high_low_PN, Sign.NEGATIVE), (high_low_NP, Sign.POSITIVE)],
}

# Numeric score for each non-neutral sentiment label (Sign.NEUTRAL has no entry).
sentiment_to_score = {
    Sign.POSITIVE: +1,
    Sign.NEGATIVE: -1,
}

def is_given_sentiment(sentiment, word, sentiment_map):
    """
    Tell whether a word carries the requested sentiment according to a lexicon.

    :param sentiment: Sign.POSITIVE or Sign.NEGATIVE
    :param word: token to look up
    :param sentiment_map: {token: score} mapping
    :return: True when the word is in the mapping and its score is strictly positive
             (for Sign.POSITIVE) or strictly negative (for Sign.NEGATIVE);
             False in every other case, including unknown words and a score of zero
    """
    if word not in sentiment_map:
        return False

    word_score = sentiment_map[word]
    if sentiment == Sign.NEGATIVE:
        return bool(word_score < 0)
    if sentiment == Sign.POSITIVE:
        return bool(word_score > 0)
    # Any other requested sentiment never matches; return an explicit False so the
    # function always yields a bool instead of falling through to None.
    return False

def calculate_composition_or_adj_sentiment(bigram):
    """
    Score a bigram with the adjective-class and composition-class rules.

    The first word decides which rule applies; rules are tried in this order:
    adjective class, reverser, propagator, dominator.

    :param bigram: tuple (first_word, second_word)
    :return: +1 or -1 when a rule fires, otherwise None
    """
    head = bigram[0]

    # Adjective: the second word must be listed for the adjective's class.
    adjective_class = adjective_class_map.get(head)
    if adjective_class is not None:
        for expansions, sign in adjective_conditions[adjective_class]:
            if bigram[1] in expansions:
                return sentiment_to_score[sign]
        # A known adjective without a matching expansion is not handed on to the
        # composition rules below.
        return None

    # Composition: Reverser
    if head in reverser_pos and is_given_sentiment(Sign.NEGATIVE, bigram[1], unigram_sentiment_dict):
        return sentiment_to_score[Sign.POSITIVE]
    if head in reverser_neg and is_given_sentiment(Sign.POSITIVE, bigram[1], unigram_sentiment_dict):
        return sentiment_to_score[Sign.NEGATIVE]

    # Composition: Propagator
    if (head in propagator_pos
            and is_given_sentiment(Sign.NEGATIVE, head, unigram_sentiment_dict)
            and is_given_sentiment(Sign.POSITIVE, bigram[1], unigram_sentiment_dict)):
        return sentiment_to_score[Sign.POSITIVE]
    if (head in propagator_neg
            and is_given_sentiment(Sign.POSITIVE, head, unigram_sentiment_dict)
            and is_given_sentiment(Sign.NEGATIVE, bigram[1], unigram_sentiment_dict)):
        return sentiment_to_score[Sign.NEGATIVE]

    # Composition: Dominator (depends on the first word only)
    if head in dominator_neg:
        return sentiment_to_score[Sign.NEGATIVE]
    if head in dominator_pos:
        return sentiment_to_score[Sign.POSITIVE]

    return None
    
        
def calculate_composition_or_adj_sentiment_sentence(sentence):
    """
    Score a sentence using only the adjective/composition rules.

    :param sentence: raw sentence string; it is lower-cased and tokenized here
    :return: (score, matched_bigrams) where score is the sum of the +1/-1 values of the
             bigrams for which a rule fired (a score above 0 means positive) and
             matched_bigrams lists those bigrams in order
    """
    total = 0
    matched = []
    words = word_tokenize(sentence.lower())
    for pair in nltk.bigrams(words):
        polarity = calculate_composition_or_adj_sentiment(pair)
        # No rule fired for this bigram: nothing to record.
        if polarity is None:
            continue
        total += polarity
        matched.append(pair)

    return total, matched


# Example: score one sentence with the adjective/composition rules only.
calculate_composition_or_adj_sentiment_sentence("The high prices are ridiculous")

(-1, [('high', 'prices')])


def calculate_sentiment_combined(sentence):
    """
    Score a sentence by combining the bigram, composition/adjective and unigram lexicons.

    Punctuation is removed, the text is lower-cased and tokenized, and every bigram
    contributes -1, 0 or +1. For each bigram the first source that has an answer wins:
    bigram lexicon, then adjective/composition rules, then the summed unigram scores of
    its two words (below -0.1 is negative, above 0.1 is positive, otherwise neutral).
    A sentence that yields no bigrams scores 0.

    :param sentence: raw sentence string
    :return: integer sum of the per-bigram contributions
    """
    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    words = word_tokenize(sentence.translate(strip_punctuation).lower())

    total = 0
    for pair in nltk.bigrams(words):
        polarity = calculate_bigram_sentiment(pair)
        if polarity is None:
            polarity = calculate_composition_or_adj_sentiment(pair)
        if polarity is None:
            # Last resort: threshold the summed unigram scores of the two words.
            unigram_total = calculate_unigram_sentiment(pair)
            if unigram_total < -0.1:
                polarity = -1
            elif unigram_total > 0.1:
                polarity = 1
            else:
                polarity = 0

        total += polarity

    return total


# Additional example sentences, scored together with comments_5.
extra_sentences = ['The high prices are ridiculous', 
                   'They do not accept payments from my credit card',
                   'I absolutely love how nice they are, I would definitely buy again from here.',
                   'I hate their products, so horrible, I cannot believe I spent so much money on shoes.',
                   'However, their webiste is beautiful and modern.',
                  ]
sentiment_sentences = comments_5 + extra_sentences

# Print the combined (bigram + composition/adjective + unigram) score of every sentence.
for sentence in sentiment_sentences:
    score = calculate_sentiment_combined(sentence)
    print('Score: {}, Sentence: {}'.format(score, sentence))

Score: 2, Sentence: Customer service was polite.
Score: -1, Sentence: The socks are a pretty color but expensive.
Score: -1, Sentence: The shirt I bought was green and service was great.
Score: -1, Sentence: I think the sweater and socks were perfect.
Score: -3, Sentence: I do not like the shoes, so ugly and expensive.
Score: -3, Sentence: The high prices are ridiculous
Score: 0, Sentence: They do not accept payments from my credit card
Score: 7, Sentence: I absolutely love how nice they are, I would definitely buy again from here.
Score: -2, Sentence: I hate their products, so horrible, I cannot believe I spent so much money on shoes.
Score: 3, Sentence: However, their webiste is beautiful and modern.


def convert_score_to_sentiment(score):
    """
    Translate a numeric score into a sentiment label.

    :param score: numeric sentiment score
    :return: Sign.POSITIVE for a score above 0, Sign.NEGATIVE for a score below 0,
             Sign.NEUTRAL otherwise
    """
    if score > 0:
        return Sign.POSITIVE
    if score < 0:
        return Sign.NEGATIVE
    return Sign.NEUTRAL
    

def calculate_sentence_level_sentiment(comment_by_sentence):
    """
    Score every sentence of a comment and add the scores up.

    :param comment_by_sentence: the comment split into sentences
                                [sentence1, sentence2, ...], each one a string
    :return: (overall_score, [(sentiment, sentence), ...]) where overall_score is the sum
             of the combined scores of all sentences and each sentiment is the label of
             that sentence's own score
    """
    labelled = []
    total = 0
    for text in comment_by_sentence:
        sentence_score = calculate_sentiment_combined(text)
        labelled.append((convert_score_to_sentiment(sentence_score), text))
        total += sentence_score
    return total, labelled
    

def group_comments_by_sentiment(comments):
    """
    Bucket comments by the sentiment of their overall score.

    :param comments: [comment, comment, ...], each one a string
    :return: defaultdict mapping a sentiment label to a list with one entry per comment;
             each entry is that comment's [(sentiment, sentence), ...] breakdown.
             Only labels that occurred are present as keys.
    """
    grouped = defaultdict(list)
    for text in comments:
        sentences = nltk.tokenize.sent_tokenize(text)
        total, labelled = calculate_sentence_level_sentiment(sentences)
        # The comment is filed under the label of its summed score.
        bucket = convert_score_to_sentiment(total)
        grouped[bucket].append(labelled)

    return grouped


# Multi-sentence example comments for the grouping step.
comments = [
    'I bought several items: socks, shirt, sweater. By far my most favorite was the shirt because it is so soft. However, the sweater and socks missed the mark.',
    'My order arrived several days late. But when I contacted customer serivce they were very helpful and refunded me.',
    'Horrible, horrible customer service, I have never met such rude people. Why is it so bad? Would not recommend at all.',
    'Everything I ordered arrived on time and looked exactly like in the pictures! This company has high quality products.',
    'I bought somethings on sale, they were a great deal. Will be buying more next time.'
]

overall_sentiment = group_comments_by_sentiment(comments)
# Short symbol shown in front of each sentence in the printout.
sign_word_to_symbol = {Sign.NEGATIVE: '-', Sign.POSITIVE: '+', Sign.NEUTRAL: '='}

# Print out results in a readable way: one section per overall sentiment, one paragraph
# per comment, one line per sentence prefixed with that sentence's own sentiment symbol.
for sign in overall_sentiment:
    print('{}:'.format(sign))
    for comment in overall_sentiment[sign]:
        sentence_printout_text = ''
        for sentence_sign, sentence in comment:
            sentence_printout_text += '({}) {}\n'.format(sign_word_to_symbol[sentence_sign], sentence)
        print(sentence_printout_text)

Negative:
(-) I bought several items: socks, shirt, sweater.
(-) By far my most favorite was the shirt because it is so soft.
(-) However, the sweater and socks missed the mark.

(-) Horrible, horrible customer service, I have never met such rude people.
(-) Why is it so bad?
(-) Would not recommend at all.

Positive:
(-) My order arrived several days late.
(+) But when I contacted customer serivce they were very helpful and refunded me.

(+) I bought somethings on sale, they were a great deal.
(=) Will be buying more next time.

Neutral:
(-) Everything I ordered arrived on time and looked exactly like in the pictures!
(+) This company has high quality products.

	Unnamed: 0	BIGRAM	POS_TAGS	SENTIMENT_SCORE	first_letter
0	0	abalone-divers	NN-NNS	-0.090230	a
1	1	abandoned-animals	VBN-NNS	-0.089895	a
2	2	abandoned-apartment	VBN-NN	-0.126907	a
3	3	abandoned-attempts	VBN-NNS	-0.053709	a
4	4	abandoned-babies	VBN-NNS	-0.074742	a

Sentiment Analysis with IBM Debater Sentiment Composition Lexicons Dataset¶

Table of Contents¶

0. Prerequisites ¶

Insert a project token¶

1. Sentiment Analysis ¶

1.1 Get Data Files Paths ¶

1.2 Unigrams ¶

1.3 Bigrams ¶

1.4 Using Composition and Adjective Classes ¶

Composition and Adjective Class¶

ADJECTIVES.xlsx¶

SEMANTIC_CLASSES.xlsx:¶

1.4.1 Reading Adjective Classes Data Files¶

1.4.2 Reading Composition Classes Data Files¶

1.4.3 Matching Adjective/Composition Classes¶

1.5 Combining Unigram, Bigram, Composition/Adj Classes ¶

1.6 Group by Overall Sentiment ¶

Example¶

Summary ¶

Authors¶

Sentiment Analysis with IBM Debater Sentiment Composition Lexicons Dataset¶

Table of Contents¶

0. Prerequisites ¶

Insert a project token¶

1. Sentiment Analysis ¶

1.1 Get Data Files Paths ¶

1.2 Unigrams ¶

1.3 Bigrams ¶

1.4 Using Composition and Adjective Classes ¶

Composition and Adjective Class¶

ADJECTIVES.xlsx¶

SEMANTIC_CLASSES.xlsx:¶

1.4.1 Reading Adjective Classes Data Files¶

1.4.2 Reading Composition Classes Data Files¶

1.4.3 Matching Adjective/Composition Classes¶

1.5 Combining Unigram, Bigram, Composition/Adj Classes ¶

1.6 Group by Overall Sentiment ¶

Example¶

Summary¶

Authors¶

Summary ¶